from pyspark.sql import SparkSession,DataFrame

class GenresByAverageRating:
    """Spark SQL job that averages movie ratings per genre."""

    def run(self, moviesDataSet: DataFrame, ratingsDataSet: DataFrame, spark: SparkSession):
        """Return the average rating of each genre.

        Args:
            moviesDataSet: Movies frame. The query reads its ``movieId``,
                ``title`` and ``genres`` columns; ``genres`` is split on
                the ``|`` character.
            ratingsDataSet: Ratings frame. The query reads its ``movieId``
                and ``rating`` columns.
            spark: Session on which the SQL statement is executed.

        Returns:
            The result of ``spark.sql`` for the query, with the columns
            ``genres`` and ``avgRating``. Movies are inner-joined to
            ratings, so a genre without any rated movie yields no row.

        Note:
            As a side effect the temp views ``movies`` and ``ratings`` are
            created or replaced.
        """
        # Make both inputs addressable from SQL under fixed view names.
        for view_name, frame in (("movies", moviesDataSet), ("ratings", ratingsDataSet)):
            frame.createOrReplaceTempView(view_name)

        # Escaping of the split pattern: the regex for a literal pipe is \|.
        # This Python literal spells it with four backslashes, so the SQL text
        # contains "\\|", which the SQL string literal is then expected to
        # unescape to \| before it reaches split().
        genre_avg_query = """
            with explode_movie as(
                select movieId,title,category
                from movies lateral view explode(split(genres,"\\\\|")) temp as category
            )
            select a.category as genres,avg(b.rating) as avgRating
            from explode_movie a inner join ratings b on a.movieId=b.movieId
            group by a.category
        """
        return spark.sql(genre_avg_query)
